1 Introduction

This analysis looks at the field data collected for the following 2017 MLRA project: - MLRA 111B - Glynwood B-slope Erosion; Northeastern IN

Spatially disaggregate the existing SSURGO polygons for Glynwood B-slope map units using ArcSIE, in order to separate different soil erosion phases. The current SSURGO maps join issues at the SSA boundaries, due to different erosion phases. This project is deemed relevant due to the current interest in Soil Health. Distinguishing the difference in erosion phases may have minimal impact on the majority of soil interpretations, but is believed to be significant in distinguishing crop yields.

2 Data Preparation

library(aqp)
library(soilDB)

library(reshape2)
library(ggplot2)
library(gridExtra)
library(knitr)

library(cluster)
library(caret)
library(party)
library(vegan)

library(rgdal)
library(sp)
library(sf)
library(raster)
library(mapview)
library(gdalUtils)

2.1 Point Data

# load pedons from NASIS
gp <- fetchNASIS()

vars <- c("peiid", "pedon_id", "taxonname", "x", "y", "describer", "erocl")
s <- site(gp)[vars]
h <- horizons(gp)


# duplicate field data from the original spreadsheet, originally came from "PEDON - ArcSIE Data Dump 11-FIN v2.0" report, diagnostic data was populated in the pedon field measured property table
g_vars <- c("peiid")
A_vars <- c("hzname", "hzdept", "clay", "texture", "m_hue", "m_value", "m_chroma")
Bt_vars <- c("hzdept", "clay", "texture", "m_value", "m_chroma")
carb_vars <- c("hzdept", "effervescence")
solum_vars <- c("hzdept")

h2 <- by(h, h[g_vars], function(x) data.frame(
  # grouping variable
  x[g_vars][1, ],
  # A horizon data
  x[A_vars][1, ],
  # Bt horizon data
  x[grepl("Bt", x$hzname), Bt_vars, ][1, ],
  # CaCO3 data
  x[x$effervescence %in% c("strong", "violent"), carb_vars][1,],
  # solum depth
  x[grepl("^C|^2C|^3C", x$hzname), solum_vars][1]
  ))
h2 <- do.call("rbind", h2)

names(h2) <- c(g_vars, "hzname", "A_hzthk", "claytotest", "texture", "mxhue", "mxvalue", "mxchroma", "hzthk", "firstbtclay", "firstbttexture", "firstbtmxvalue", "firstbtmxchroma", "CaCO3Dp", "effervescence", "SolumDp")
names(s)[c(2:5, 7)]  <- c("upedonid", "soilname", "long", "lat", "EroClassFD")

h2 <- within(h2,{
             CaCO3Dp[is.na(CaCO3Dp)] <- 200
             SolumDp[is.na(SolumDp)] <- 200
             })

gw1 <- merge(s, h2, by = "peiid", all.x = TRUE)
gw1 <- gw1[complete.cases(gw1[c("lat", "long")]), ]
load(file = paste0(ownCloud, "201711FIN001_glynwood_pol.RData"))

# load field data
gw2 <- read.csv(paste0(ownCloud, "Pts_gnbero_27Jan17.csv"))
vars <- c("upedonid", "EroClassSIE", "relpos", "SlopeSIE", "wetness", "PlanCrv", "ProfCrv", "maxcrv", "mincrv")
gw <- merge(gw1, gw2[vars], by = "upedonid", all.x = TRUE)

# extract mupolygon data from field coordinates and merge with mapunit data from NASIS
vars <- c("AREASYMBOL", "nationalmusym", "MUNAME", "upedonid")
test <- data.frame(R11FIN_pol_int)[vars]
gw <- merge(gw, test, by = "upedonid", all.x = TRUE)

# erosion labels
ero_labels <- c("undisturbed", "slight", "moderate", "severe")

# tranform the original dataset
gw <- within(gw, {
  # convert erosion codes to labels
  EroClassFD  = factor(EroClassFD, levels = 0:3, labels = ero_labels)
  EroClassSIE = factor(EroClassSIE, levels = 0:3, labels = ero_labels)
  EroClassFD2 = ifelse(EroClassFD == "severe", "severe", "slight")
  
  # extract erosion phases from mapunit names
  EroClassNASIS = NA
  EroClassNASIS[grepl("eroded", MUNAME)] = "eroded"
  EroClassNASIS[grepl("sev.|severely", MUNAME)] = "sev.eroded"
  EroClassNASIS[!grepl("eroded", MUNAME)] = "non.eroded"
  
  # extract landform from mapunit names
  landform = NA
  landform[grepl("ground moraine", MUNAME)] = "ground"
  landform[grepl("end moraine", MUNAME)] =  "end"
  
  # correlate field taxonnames to mapunit components
  soilname2 = soilname
  soilname2 = ifelse(soilname2 %in% c("Glynwood", "Morley", "Shinrock", "Rawson", "Mississinewa"), "Glynwood", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Blount", "Elliott"), "Blount", soilname2)
  soilname2 = ifelse(soilname2 %in% c("Pewamo", "Pandora", "Mermill"), "Pewamo", soilname2)
  soilname3 = paste0(soilname2, ifelse(soilname2 == "Glynwood", paste0("-", EroClassFD2), ""))
  
  # difference between A and Bt horizons
  clay_dif = firstbtclay - claytotest
  tex_dif  = firstbttexture == texture
  dep_dif  = hzthk - 15
  value_dif = firstbtmxchroma - mxvalue
  chroma_dif = firstbtmxchroma - mxchroma
  })

# convert transformed field dataset to a spatial object
gw_sp <- gw
coordinates(gw_sp) <- ~ long + lat
proj4string(gw_sp) <- CRS("+init=epsg:4326")

gw_sf <- st_as_sf(gw_sp)

gw_sp <- spTransform(gw_sp, CRS("+init=epsg:5070"))

2.2 Spatial Data

The geodata from the Glynwood points was extracted from several rasters at various resolutions. The data using to generate the ArcSIE model came from a DEM with a resolution of 15-feet. The other used came from the 10-meter USGS NED, which was primarily resampled from LiDAR.

# Extract data from rasters
# NW files
fd <- paste0(geodata, "project_data/11FIN/PointDataEval/")
dd <- c("slope10",
        "procrv10",
        "plncrv10",
        "maxcrv10",
        "mincrv10",
        "relpos_r5",
        "wetness_mp"
        )
fp <- paste0(fd, "Mosaic_NW_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_nw <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_nw <- subset(gd_nw, !is.na(slope10))

# SE files
fp <- paste0(fd, "Mosaic_SE_pts/Derivatives/", dd, "/", "w001001.adf")
rs <- stack(fp)
names(rs) <- dd
proj4string(rs) <- CRS("+init=epsg:2965")
gd_se <- extract(rs, gw_sp, df = TRUE, sp = TRUE)@data
gd_se <- subset(gd_se, !is.na(slope10))

gd15ft <- rbind(gd_nw, gd_se)
rm(gd_nw, gd_se)
write.csv(gd15ft, file = paste0(ownCloud, "geodata_15ft_extract.csv"))


# Region 11 files

subset_rasters <- function(input, output) {
  cat(paste0(input, "\n"))
  gdal_translate(
    src_dataset = input,
    dst_dataset = output,
    projwin = c(bb[1,1], bb[2,2], bb[1,2], bb[2,1]),
    of = "GTiff",
    a_nodata = -99999,
    overwrite = TRUE,
    verbose = TRUE
    )
  }

warp_rasters <- function(input, output){
  cat(paste0(input,"\n"))
  gdalwarp(
    srcfile = input,
    dstfile = output,
    te = bb,
    s_srs = CRSargs(CRS("+init=epsg:5070")),
    t_srs = CRSargs(CRS("+init=epsg:5070")),
    r = "bilinear",
    tr = c(10, 10),
    of = "GTiff",
    overwrite = TRUE,
    verbose = TRUE 
    )
  }

fd <- paste0(geodata, "project_data/11FIN/sdat/")
dd2 <- c("ned10m_11FIN.sdat",
        "ned10m_11FIN_aspect5.sdat",
        "ned10m_11FIN_slope5.sdat",
        "ned10m_11FIN_cupro5.sdat",
        "ned10m_11FIN_cutan5.sdat",
        "ned30m_11FIN_mvalleys.sdat",
        "ned30m_11FIN_wetness.sdat",
        "ned30m_11FIN_z2stream.sdat",
        "nlcd30m_11FIN_lulc2011.sdat"
        )
dd_names <- c("elev", "aspect5", "slope5", "kp5", "kt5", "mvalley", "wetness2", "z2streams", "lulc")
dd <- paste0(fd, dd2)

input <- dd
output <- paste0(geodata, "project_data/11FIN/201711FIN001_Glynwood/", gsub(".sdat", ".tif", dd2))
bb <- st_bbox(st_transform(gw_sf, crs = "+init=epsg:5070"))

mapply(subset_rasters, input, output)
mapply(subset_rasters, 
       input = paste0(geodata, "project_data/11FIN/nlcd30m_11FIN_lulc2011.tif"), 
       output = paste0(geodata, "project_data/11FIN/201711FIN001_Glynwood/nlcd30m_11FIN_lulc2011.tif")
       )
mapply(warp_rasters, input = output, output = gsub(".tif", "2.tif", gsub("30m", "10m", output)))

dd <- output
 
rs10m <- stack(dd[grepl("10m", dd)])
names(rs10m) <- dd_names[1:5]
rs30m <- stack(dd[grepl("30m", dd)])
names(rs30m) <- dd_names[6:9]

rs10m <- stack(gsub(".tif", "2.tif", gsub("30m", "10m", output)))
names(rs10m) <- dd_names

gd10m <- as.data.frame(extract(rs10m, gw_sp, df = TRUE, sp = TRUE))
gd30m <- as.data.frame(extract(rs30m, gw_sp, df = TRUE))
gw <- cbind(gd10m, gd30m[, -1])
rm(gd10m, gd30m)


# Save data
save(gw, gw_sf, gw_sp, ero_labels, file = paste0(ownCloud, "201711FIN001_glynwood_geodata.RData"))

3 Plot Pedon Coordinates, SSA, and Series Extent

# Load cached dataset
load(paste0(ownCloud, "201711FIN001_glynwood_pol.RData"))
load(paste0(ownCloud, "201711FIN001_glynwood_geodata.RData"))


# Create interactive map
mapView(gw_series) + ssa + gw_sf

4 Map Units vs Field Observations

vars <- c("EroClassFD", "EroClassNASIS", "nationalmusym", "AREASYMBOL")
gw_sub <- gw[vars]
gw_dup <- gw[vars]

# Frequency of field observation vs map unit
# Duplicate the data for each REASYMBOL and relabel MLRA
gw_dup["AREASYMBOL"] <- "MLRA"
gw_dup <- rbind(gw_sub, gw_dup)
gw_dup$natmuSsaEro <-  with(gw_dup,
                            paste0(nationalmusym, "-", AREASYMBOL, "-", EroClassNASIS)
                            )
test <- xtabs(~ natmuSsaEro + EroClassFD, data = gw_dup)
kable(addmargins(test, margin = 2), caption = "Frequence by MUSYM-SSA-EROSION")
Frequence by MUSYM-SSA-EROSION
undisturbed slight moderate severe Sum
2psgt-IN009-sev.eroded 0 0 1 0 1
2psgt-MLRA-sev.eroded 0 0 1 0 1
2t6ll-IN009-sev.eroded 13 15 17 3 48
2t6ll-IN053-sev.eroded 4 13 18 13 48
2t6ll-IN075-sev.eroded 6 8 4 5 23
2t6ll-IN179-sev.eroded 1 9 6 10 26
2t6ll-MLRA-sev.eroded 24 45 45 31 145
2t6lm-IN009-sev.eroded 2 11 10 1 24
2t6lm-IN053-sev.eroded 2 4 8 11 25
2t6lm-IN075-sev.eroded 3 1 11 9 24
2t6lm-IN179-sev.eroded 9 8 0 13 30
2t6lm-MLRA-sev.eroded 16 24 29 34 103
2v4bn-IN069-eroded 5 4 7 6 22
2v4bn-IN179-eroded 0 3 4 6 13
2v4bn-MLRA-eroded 5 7 11 12 35
2v4bp-IN179-eroded 0 3 0 2 5
2v4bp-MLRA-eroded 0 3 0 2 5
2v4bt-IN035-eroded 1 3 3 5 12
2v4bt-MLRA-eroded 1 3 3 5 12
5dsh-IN179-non.eroded 0 0 1 0 1
5dsh-MLRA-non.eroded 0 0 1 0 1
5dtg-IN179-non.eroded 0 0 0 1 1
5dtg-MLRA-non.eroded 0 0 0 1 1
5jjt-IN035-sev.eroded 1 1 2 9 13
5jjt-MLRA-sev.eroded 1 1 2 9 13
725n-IN075-non.eroded 0 0 2 0 2
725n-MLRA-non.eroded 0 0 2 0 2
7264-IN075-non.eroded 0 1 0 0 1
7264-MLRA-non.eroded 0 1 0 0 1
NA-MLRA-non.eroded 5 5 3 28 41
NA-NA-non.eroded 5 5 3 28 41
kable(round(prop.table(test, 1) * 100), caption = "Percent by MUSYM-SSA-EROSION")
Percent by MUSYM-SSA-EROSION
undisturbed slight moderate severe
2psgt-IN009-sev.eroded 0 0 100 0
2psgt-MLRA-sev.eroded 0 0 100 0
2t6ll-IN009-sev.eroded 27 31 35 6
2t6ll-IN053-sev.eroded 8 27 38 27
2t6ll-IN075-sev.eroded 26 35 17 22
2t6ll-IN179-sev.eroded 4 35 23 38
2t6ll-MLRA-sev.eroded 17 31 31 21
2t6lm-IN009-sev.eroded 8 46 42 4
2t6lm-IN053-sev.eroded 8 16 32 44
2t6lm-IN075-sev.eroded 12 4 46 38
2t6lm-IN179-sev.eroded 30 27 0 43
2t6lm-MLRA-sev.eroded 16 23 28 33
2v4bn-IN069-eroded 23 18 32 27
2v4bn-IN179-eroded 0 23 31 46
2v4bn-MLRA-eroded 14 20 31 34
2v4bp-IN179-eroded 0 60 0 40
2v4bp-MLRA-eroded 0 60 0 40
2v4bt-IN035-eroded 8 25 25 42
2v4bt-MLRA-eroded 8 25 25 42
5dsh-IN179-non.eroded 0 0 100 0
5dsh-MLRA-non.eroded 0 0 100 0
5dtg-IN179-non.eroded 0 0 0 100
5dtg-MLRA-non.eroded 0 0 0 100
5jjt-IN035-sev.eroded 8 8 15 69
5jjt-MLRA-sev.eroded 8 8 15 69
725n-IN075-non.eroded 0 0 100 0
725n-MLRA-non.eroded 0 0 100 0
7264-IN075-non.eroded 0 100 0 0
7264-MLRA-non.eroded 0 100 0 0
NA-MLRA-non.eroded 12 12 7 68
NA-NA-non.eroded 12 12 7 68

Several of counties phased severely eroded, are not dominanted by field observations classified as severely eroded.

5 Accuracy Assessment of the ArcSIE Predictions

cm <- confusionMatrix(data = gw$EroClassSIE, reference = gw$EroClassFD)
print(cm)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed           0      0        0      0
##   slight                9      6       10      9
##   moderate             27     35       51     37
##   severe               12     31       21     45
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3481          
##                  95% CI : (0.2937, 0.4057)
##     No Information Rate : 0.3106          
##     P-Value [Acc > NIR] : 0.09339         
##                                           
##                   Kappa : 0.0853          
##  Mcnemar's Test P-Value : 7.635e-15       
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.0000       0.08333          0.6220
## Specificity                      1.0000       0.87330          0.5308
## Pos Pred Value                      NaN       0.17647          0.3400
## Neg Pred Value                   0.8362       0.74517          0.7832
## Prevalence                       0.1638       0.24573          0.2799
## Detection Rate                   0.0000       0.02048          0.1741
## Detection Prevalence             0.0000       0.11604          0.5119
## Balanced Accuracy                0.5000       0.47832          0.5764
##                      Class: severe
## Sensitivity                 0.4945
## Specificity                 0.6832
## Pos Pred Value              0.4128
## Neg Pred Value              0.7500
## Prevalence                  0.3106
## Detection Rate              0.1536
## Detection Prevalence        0.3720
## Balanced Accuracy           0.5888
test <- as.data.frame(cm$table)

ggplot(test, aes(x = Reference, y = Freq, fill = Prediction)) +
  geom_bar(stat = "identity") +
  coord_flip()

The accuracy of the current ArcSIE model appears to be low, according to several metrics. The positive predictive value for the severe class is < 50%.

5.1 Boxplots

soil_vals <- c("hzthk", "SolumDp", "CaCO3Dp", "claytotest", "firstbtclay", "mxvalue", "mxchroma")
soil_dif_vals <- c("clay_dif", "tex_dif", "dep_dif", "value_dif", "chroma_dif")
geo_vals1 <- c("SlopeSIE", "ProfCrv", "PlanCrv", "relpos", "wetness")
geo_vals2 <- c("slope5", "kt5", "kp5", "z2streams", "wetness2", "mvalley")

vals <- c(soil_vals, soil_dif_vals, geo_vals1, geo_vals2)
gw <- gw[complete.cases(gw[c("EroClassFD", soil_vals)]), ]

gw_lo1 <- melt(gw, id.vars = c("EroClassFD", "landform"), measure.vars = vals)
gw_lo2 <- melt(gw, id.vars = c("EroClassSIE", "landform"), measure.vars = vals)

names(gw_lo1)[1] <- "EroClass"
gw_lo1$method <- "FD"
names(gw_lo2)[1] <- "EroClass"
gw_lo2$method <- "SIE"
gw_lo <- rbind(gw_lo1, gw_lo2)
gw_lo <- subset(gw_lo, !is.na(EroClass))

gw_lo <- na.exclude(gw_lo)
ggplot(gw_lo1, aes(x = EroClass, y = value, color = landform)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) +
  coord_flip()

An exploratory analysis shows a considerable amount of overlap exists between the field determined (FD) erosion classes and measurable soil properties. In comparison the FD and SIE (Soil Inference Engine) erosion classes show different patterns within the boxplots, further suggesting that the SIE classes aren’t capturing the field observations accurately. The most important feature to highlight is that the trends between the SIE classes and digital elevation model (DEM) derivatives (i.e. slope) don’t match those observed for the FD classes. This mismatch suggests that the membership functions for the SIE classes are a poor fit, and should be redefined to more accurately represent the relationship between the FD classes and DEM derivatives.

5.2 Scatterplots

soil_vals2 <- c("hzthk", "SolumDp", "CaCO3Dp", "claytotest", "firstbtclay") # excluded color, only observed a narrow range thus small differences swamp everthing else
vals <- c(soil_vals2)

test <- gw[, vals]
test_d <- daisy(scale(test), metric = "gower")
test_mds <- metaMDS(test_d, distance = "gower", autotransform = FALSE, trace = FALSE)
test_pts <- cbind(as.data.frame(test_mds$points), EroClassFD = gw$EroClassFD)

g1 <- ggplot(gw, aes(x = claytotest, y = hzthk, color = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  xlim(c(max(gw$claytotest), min(gw$claytotest))) +
  theme(aspect.ratio = 1)
g2 <- ggplot(test_pts, aes(x = MDS1, y = MDS2, color = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

According to the scatterplot above it appears that only the severe and slight classes are separatable. The moderate erosion class seems to overlap the most with slight. The overlap in the FD classes is likely due to bias within and between the soil scientists who collected the data. Both the 15-feet and 10-meter DEM derivatives were evaluated, but the results are similar.

5.3 Classification Tree

test <- subset(gw, !is.na(EroClassFD))

test_ct <- ctree(EroClassFD ~ ., data = test[, c("EroClassFD", soil_vals)])
plot(test_ct)

cm <- confusionMatrix(data = predict(test_ct, type = "response"), reference = test$EroClassFD)
print(cm)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          42      5        6      1
##   slight                8     67        8      1
##   moderate              1     13       58      3
##   severe                0      2       23    109
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7954          
##                  95% CI : (0.7491, 0.8366)
##     No Information Rate : 0.3285          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7199          
##  Mcnemar's Test P-Value : 0.001127        
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      0.8235        0.7701          0.6105
## Specificity                      0.9595        0.9346          0.9325
## Pos Pred Value                   0.7778        0.7976          0.7733
## Neg Pred Value                   0.9693        0.9240          0.8640
## Prevalence                       0.1470        0.2507          0.2738
## Detection Rate                   0.1210        0.1931          0.1671
## Detection Prevalence             0.1556        0.2421          0.2161
## Balanced Accuracy                0.8915        0.8524          0.7715
##                      Class: severe
## Sensitivity                 0.9561
## Specificity                 0.8927
## Pos Pred Value              0.8134
## Neg Pred Value              0.9765
## Prevalence                  0.3285
## Detection Rate              0.3141
## Detection Prevalence        0.3862
## Balanced Accuracy           0.9244

An analysis of the EroClassFD above with a classification tree is an attempt to discern the hierachical structuce within the data. The results show Ap thickness (hzthk) and clay content (claytotest) are the first splits. The trees structure follows the logic described in the erosion indicators guide developed for this project. The overall accuracy for the tree is 0.8.

6 Hierachical Clusters

In order to see if more separation can be achieved amongst the erosion classes a hierachical classifition was peformed. Four unsupervised classes were chosen and manually matched to the FD classes.

test_c <- hclust(test_d, method = "ward.D")
plot(test_c, labels = gw$upedonid)
rect.hclust(test_c, k = 4)

clusters <- cbind(gw, 
                  test_pts[, 1:2], 
                  clusters = factor(cutree(test_c, k = 4),
                                    levels = c(2, 3, 1, 4),
                                    labels = ero_labels
                                    )
                  )
clusters <- cbind(gw, 
                  test_pts[, 1:2], 
                  clusters = factor(cutree(test_c, k = 4),
                                    levels = c(4, 3, 2, 1),
                                    labels = ero_labels
                                    )
                  )

xtabs(~ EroClassFD + clusters, data = clusters)
##              clusters
## EroClassFD    undisturbed slight moderate severe
##   undisturbed          32     16        3      0
##   slight               11     53       23      0
##   moderate              5     20       53     17
##   severe                0      2       40     72

6.1 Scatter Plots

g1 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = EroClassFD)) +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
g2 <- ggplot(clusters, aes(x = MDS1, y = MDS2, col = clusters), main = "test") +
  geom_point(cex = 2, alpha = 0.75) +
  theme(aspect.ratio = 1)
grid.arrange(g1, g2, ncol = 2)

In comparison the hierarchical clusters have less overlap when viewed along the multidimensional scaled axes, but still does not seem to separate the moderate class.

6.2 Box Plots

gw_lo1 <- melt(gw, id.vars = "EroClassFD", measure.vars = c(soil_vals, geo_vals2))
gw_lo3 <- melt(clusters, id.vars = "clusters", measure.vars = c(soil_vals, geo_vals2))

names(gw_lo1)[1] <- "EroClass"
names(gw_lo3)[1] <- "EroClass"
gw_lo1$method <- "FD"
gw_lo3$method <- "clusters"
gw_lo <- rbind(gw_lo1, gw_lo3)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) + 
  coord_flip()

A comparison of the FD and cluster classes shows that the clusters do a good job replicating the patterns found in the boxplots.

6.3 Classification Tree

test2 <- ctree(clusters ~ ., data = clusters[, c("clusters", soil_vals)])
plot(test2)

confusionMatrix(data = predict(test2, type = "response"), reference = clusters$clusters)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    undisturbed slight moderate severe
##   undisturbed          48      0        0      0
##   slight                0     82       16      0
##   moderate              0      6       97      7
##   severe                0      3        6     82
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8905          
##                  95% CI : (0.8528, 0.9213)
##     No Information Rate : 0.3429          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8502          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: undisturbed Class: slight Class: moderate
## Sensitivity                      1.0000        0.9011          0.8151
## Specificity                      1.0000        0.9375          0.9430
## Pos Pred Value                   1.0000        0.8367          0.8818
## Neg Pred Value                   1.0000        0.9639          0.9072
## Prevalence                       0.1383        0.2622          0.3429
## Detection Rate                   0.1383        0.2363          0.2795
## Detection Prevalence             0.1383        0.2824          0.3170
## Balanced Accuracy                1.0000        0.9193          0.8791
##                      Class: severe
## Sensitivity                 0.9213
## Specificity                 0.9651
## Pos Pred Value              0.9011
## Neg Pred Value              0.9727
## Prevalence                  0.2565
## Detection Rate              0.2363
## Detection Prevalence        0.2622
## Balanced Accuracy           0.9432

In comparision, the classification tree for the clusters splits primarily on the CaCO3 and solum depths, presumable due to the narrow range in Ap thickness.

7 Statistical Modeling

Below several statistical models were evaluated to see if a more accurate model could be developed.

7.1 FD Classes vs DEM Derivatives

test3 <- ctree(EroClassFD ~ ., data = gw[, c("EroClassFD", geo_vals2)])
plot(test3)

cm_ct <- confusionMatrix(data = predict(test3, type = "response"), reference = gw$EroClassFD)
round(cm_ct$overall, 2)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.39           0.20           0.34           0.45           0.33 
## AccuracyPValue  McnemarPValue 
##           0.01           0.00
test3 <- cforest(as.factor(EroClassFD) ~ ., data = gw[, c("EroClassFD", geo_vals2)])
varimp(test3)
##        slope5           kt5           kp5     z2streams      wetness2 
##  0.0475590551  0.0181417323  0.0037007874  0.0020944882  0.0192440945 
##       mvalley 
## -0.0006141732
cm_cf <-confusionMatrix(data = predict(test3, type = "response", OOB = TRUE), reference = gw$EroClassFD)
round(cm_cf$overall, 2)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.39           0.15           0.34           0.44           0.33 
## AccuracyPValue  McnemarPValue 
##           0.01           0.25

Neither a classification tree or forest were capiable of achieving a higher accuracy than the SIE model.

7.2 Clusters vs DEM Derivatives

test4 <- ctree(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
plot(test4)

cm_ct <- confusionMatrix(data = predict(test4, type = "response"), reference = clusters$clusters)
round(cm_ct$overall, 2)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.41           0.20           0.36           0.46           0.34 
## AccuracyPValue  McnemarPValue 
##           0.01           0.00
test4 <- cforest(clusters ~ ., data = clusters[, c("clusters", geo_vals2)])
varimp(test4)
##        slope5           kt5           kp5     z2streams      wetness2 
##  1.352756e-02  3.285039e-02 -1.385827e-03  2.503937e-03  4.724409e-05 
##       mvalley 
##  8.645669e-03
cm_cf <- confusionMatrix(data = predict(test4, type = "response", OOB=TRUE), reference = clusters$clusters)
round(cm_cf$overall, 2)
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##           0.35           0.10           0.30           0.41           0.34 
## AccuracyPValue  McnemarPValue 
##           0.34           0.13

Neither a classification tree or forest were capiable of achieving a higher accuracy than the SIE model.

7.3 Soil series and phases

Thus far efforts to model the erosion classes has been lackluster. This appears to be largely due to the overlap in the erosion classes and subtle relief. Given these challenges it is probably more realistic to focus on distinguishing the severely eroded class separately, and develop individual models for the minor components. ry and model the soil components and phases separately.

# create a logical variable for the soilname3 == "Glynwood-severe"
gw$gw_severe <- ifelse(gw$soilname3 == "Glynwood-severe", TRUE, FALSE)

# Random Forest
test4 <- cforest(as.factor(gw_severe) ~ elev + slope5 + kt5 + kp5 + wetness2 + mvalley + z2streams, data = gw)
sort(varimp(test4), decreasing = TRUE)
##      slope5    wetness2        elev         kt5   z2streams     mvalley 
## 0.031039370 0.023023622 0.021322835 0.018881890 0.005795276 0.003921260 
##         kp5 
## 0.002094488
confusionMatrix(data = predict(test4, type = "response", OOB = TRUE), reference = gw$gw_severe, positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   209   74
##      TRUE     26   38
##                                           
##                Accuracy : 0.7118          
##                  95% CI : (0.6611, 0.7589)
##     No Information Rate : 0.6772          
##     P-Value [Acc > NIR] : 0.09243         
##                                           
##                   Kappa : 0.2575          
##  Mcnemar's Test P-Value : 2.602e-06       
##                                           
##             Sensitivity : 0.3393          
##             Specificity : 0.8894          
##          Pos Pred Value : 0.5937          
##          Neg Pred Value : 0.7385          
##              Prevalence : 0.3228          
##          Detection Rate : 0.1095          
##    Detection Prevalence : 0.1844          
##       Balanced Accuracy : 0.6143          
##                                           
##        'Positive' Class : TRUE            
## 
# Logisitic Regression
test3 <- glm(as.factor(gw_severe) ~ elev + slope5 + kt5, data = gw, family = "binomial", na.action = na.exclude)
confusionMatrix(data = predict(test3, type = "response") > 0.4, reference = gw$gw_severe, positive = "TRUE")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   190   49
##      TRUE     44   63
##                                           
##                Accuracy : 0.7312          
##                  95% CI : (0.6812, 0.7772)
##     No Information Rate : 0.6763          
##     P-Value [Acc > NIR] : 0.01568         
##                                           
##                   Kappa : 0.3789          
##  Mcnemar's Test P-Value : 0.67830         
##                                           
##             Sensitivity : 0.5625          
##             Specificity : 0.8120          
##          Pos Pred Value : 0.5888          
##          Neg Pred Value : 0.7950          
##              Prevalence : 0.3237          
##          Detection Rate : 0.1821          
##    Detection Prevalence : 0.3092          
##       Balanced Accuracy : 0.6872          
##                                           
##        'Positive' Class : TRUE            
## 
summary(test3)
## 
## Call:
## glm(formula = as.factor(gw_severe) ~ elev + slope5 + kt5, family = "binomial", 
##     data = gw, na.action = na.exclude)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.625  -0.852  -0.636   1.110   2.299  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.940289   1.870051  -4.246 2.18e-05 ***
## elev         0.021535   0.006583   3.271 0.001070 ** 
## slope5       0.389700   0.101587   3.836 0.000125 ***
## kt5          0.088141   0.026213   3.363 0.000772 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 435.70  on 345  degrees of freedom
## Residual deviance: 395.53  on 342  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 403.53
## 
## Number of Fisher Scoring iterations: 3
gw$predicted <- predict(test3, type = "response") > 0.4
gw_lo1 <- melt(gw, id.vars = "gw_severe", measure.vars = vals)
gw_lo2 <- melt(gw, id.vars = "predicted", measure.vars = vals)
gw_lo2 <- na.exclude(gw_lo2)

names(gw_lo1)[1] <- "EroClass"
gw_lo1$method <- "FD"
names(gw_lo2)[1] <- "EroClass"
gw_lo2$method <- "GLM"
gw_lo <- rbind(gw_lo1, gw_lo2)

ggplot(gw_lo, aes(x = EroClass, y = value)) +
  geom_boxplot() +
  facet_wrap(~ paste(variable, method), scales="free", ncol = 4) +
  coord_flip()

predfun <- function(model, data) {
  v <- predict(model, data, type = "response")
  cbind(
    p = as.vector(v$fit)
    )
}
r <- predict(rs10m, test3, fun = predfun, index = 1:2, progress = "text")
writeRaster(r[[1]], "C:/workspace/severe_erosion.tif", overwrite = TRUE, progress = "text")

r <-predict(rs10m, test4, type='response', progress='text')
writeRaster(r[[1]], "C:/workspace/severe_erosion_cf.tif", overwrite = TRUE, progress = "text")

8 Summary

8.1 Issues

  • The field data does not confirm the map units phased severely eroded.
  • The previous analysis did not exclude the minor components.
  • An exploratory analysis illustrated a considerable amount of overlap exists between the field determined (FD) erosion classes and measurable soil properties. However, the FD erosion classes were similarly predictive as the results of a cluster analysis. The classes derived by cluster analysis appeared to overlap slightly and moderately eroded, and were best separated by depth to CaCO3, while the FD erosion classes were best split on A horizon thickness.
  • The accuracy of the current ArcSIE model appears to be low, according to several metrics.
  • A preliminary analysis found a random forest model to be 20% more accurate than the ArcSIE model.
  • Re-delineating the SSURGO map units will most likely result in numerous small delineations.

8.2 Recommendations

  • If disaggregating the SSURGO polygons is deemed impractical or inaccurate, re-label the Glynwood B-slope map units (e.g. severely eroded) to the appropriate map unit concept (e.g moderately eroded).
  • Evaluate ways to improve the accuracy of the ArcSIE model, or test alternative models such as random forests.
  • Conduct a field review May 15th (?) to evaluate the erosion phase concepts and spatial model.
  • Evaluate the effect of lumping erosion classes (e.g. slightly and moderately).
  • Access yield data from a producer in order to validate the hypothesis that the erosion phases help explain yield variability.
  • Load the pedons directly from NASIS. Make sure the user project id is populated in the site observation table for easy querying.
  • Validate the model with other pedons from NASIS.
  • Reclassify the points using the erosion indicators guide.